Efficient SBC decoding

BT's SBC synthesis

Bluetooth's SBC codec is pretty good (for a low-complexity, royalty-free codec, that is). Implementing it is trivial (the spec shows you how), but optimizing it is not. Presented here is a rather optimal C implementation that makes it obvious how to vectorize it (if your architecture allows it), with lots of knobs to twist. Obviously, no FPU is used. A few quality settings exist. First, one can set the precision of the fixed-point math to 32, 16, or 8 bits (which use 64-, 32-, and 16-bit temporary variables for multiplication, respectively). One can also choose to shift instead of divide, causing some errors in quieter sounds but saving lots of cycles on machines where divides are slow.
The strange-looking tables you see were generated by some scripts I wrote; they are at the bottom.
I did all this in my free time in one day (May 26). This code is free to use for non-commercial purposes (for commercial purposes, contact me). Any copies of the code must include this license and credit to me with them at all times.

//selectable precisions of the fixed-point math (pick one as QUALITY below)
#define QUALITY_LOWEST	1	//8-bit math; you may notice the quality reduction
#define QUALITY_MEDIUM	2	//16-bit math; pretty good
#define QUALITY_GREAT	3	//32-bit math; as good as it will get without an FPU


///config options begin


#define QUALITY	QUALITY_GREAT
//#define SPEED_OVER_ACCURACY			//set to cheat a bit with shifts (saves a divide per sample)
#define ITER		uint8_t			//loop counter type, must count up to 180; use fastest type for your platform

///config options end


/*
	Everything below is derived from QUALITY:

	CONST(x)		narrows a 32-bit table literal to the width of FIXED
	SAMPLE_CVT(x)		sample scaling helper (not referenced by the synthesis code in this part of the file)
	INSAMPLE / OUTSAMPLE	element types of the synth_*() input and output buffers
	FIXED			type of the table entries and of the V history buffer
	FIXED_S			wider type used to accumulate products of two values
	OUT_CLIP_MAX / _MIN	range output samples are clipped to before being stored
	NUM_FRAC_BITS_PROTO	fraction bits left in the proto_* tables after CONST()
	NUM_FRAC_BITS_COS	fraction bits left in the costab_* tables after CONST()

	Macro arguments and negative constants are parenthesized so they expand
	safely inside any expression. The 16-bit lower clip limit is spelled
	(-0x7FFF - 1) because 0x8000 has type unsigned int where int is 16 bits
	wide, which would make -0x8000 a positive value there.
*/
#if QUALITY == QUALITY_LOWEST

	#define CONST(x)		((x) >> 24)
	#define SAMPLE_CVT(x)		((x) >> 8)
	#define INSAMPLE		int8_t
	#define OUTSAMPLE		uint8_t	//no point producing 16-bit samples using the 8-bit decoder
	#define FIXED			int8_t
	#define FIXED_S			int16_t
	#define OUT_CLIP_MAX		0x7F
	#define OUT_CLIP_MIN		(-0x80)

	#define NUM_FRAC_BITS_PROTO	8
	#define NUM_FRAC_BITS_COS	6

#elif QUALITY == QUALITY_MEDIUM

	#define CONST(x)		((x) >> 16)
	#define SAMPLE_CVT(x)		(x)
	#define INSAMPLE		int16_t
	#define OUTSAMPLE		uint16_t
	#define FIXED			int16_t
	#define FIXED_S			int32_t
	#define OUT_CLIP_MAX		0x7FFF
	#define OUT_CLIP_MIN		(-0x7FFF - 1)

	#define NUM_FRAC_BITS_PROTO	16
	#define NUM_FRAC_BITS_COS	14

#elif QUALITY == QUALITY_GREAT

	#define CONST(x)		(x)
	#define SAMPLE_CVT(x)		(x)
	#define INSAMPLE		int16_t
	#define OUTSAMPLE		uint16_t
	#define FIXED			int32_t
	#define FIXED_S			int64_t
	#define OUT_CLIP_MAX		0x7FFF
	#define OUT_CLIP_MIN		(-0x7FFF - 1)

	#define NUM_FRAC_BITS_PROTO	32
	#define NUM_FRAC_BITS_COS	30

#else

	#error "You did not define SBC decoder synthesizer quality to use"

#endif

#include <stdint.h>


/*
	Prototype-filter (window) coefficients for the 4-subband synthesis,
	read by synth_4(). The 40 values are stored in the order they are
	accessed: each output sample consumes 10 consecutive entries. The
	floating-point originals (in their natural order) and the scripts used
	for conversion and reordering are in the comment at the end of this file.
	CONST() narrows each 32-bit literal to the FIXED width chosen by QUALITY.
*/
static const FIXED proto_4_40[] =
{
	CONST(0x00000000), CONST(0x00FB7991), CONST(0x02CB3E8B), CONST(0x069FDC59),
	CONST(0x22B63DA5), CONST(0x4B583FE6), CONST(0xDD49C25B), CONST(0x069FDC59),
	CONST(0xFD34C175), CONST(0x00FB7991), CONST(0x002329CC), CONST(0x00FF11CA),
	CONST(0x053B7546), CONST(0x0191E578), CONST(0x31EAB920), CONST(0x4825E4A3),
	CONST(0xEC1F5E6D), CONST(0x083DDC80), CONST(0xFF3773A8), CONST(0x00B32807),
	CONST(0x0061C5A7), CONST(0x007A4737), CONST(0x07646684), CONST(0xF89F23A7),
	CONST(0x3F23948D), CONST(0x3F23948D), CONST(0xF89F23A7), CONST(0x07646684),
	CONST(0x007A4737), CONST(0x0061C5A7), CONST(0x00B32807), CONST(0xFF3773A8),
	CONST(0x083DDC80), CONST(0xEC1F5E6D), CONST(0x4825E4A3), CONST(0x31EAB920),
	CONST(0x0191E578), CONST(0x053B7546), CONST(0x00FF11CA), CONST(0x002329CC)
};

/*
	Prototype-filter (window) coefficients for the 8-subband synthesis,
	read by synth_8(). The 80 values are stored in the order they are
	accessed: each output sample consumes 10 consecutive entries. The
	floating-point originals (in their natural order) and the scripts used
	for conversion and reordering are in the comment at the end of this file.
	CONST() narrows each 32-bit literal to the FIXED width chosen by QUALITY.
*/
static const FIXED proto_8_80[] =
{
	CONST(0x00000000), CONST(0x0083D8D4), CONST(0x0172E691), CONST(0x034FD9E0),
	CONST(0x116860F5), CONST(0x259ED8EB), CONST(0xEE979F0B), CONST(0x034FD9E0),
	CONST(0xFE8D196F), CONST(0x0083D8D4), CONST(0x000A42E6), CONST(0x0089DE90),
	CONST(0x020E372C), CONST(0x02447D75), CONST(0x153E7D35), CONST(0x253844DE),
	CONST(0xF2625120), CONST(0x03EBE849), CONST(0xFF1ACF26), CONST(0x0074E5CF),
	CONST(0x00167EE3), CONST(0x0082B6EC), CONST(0x02AD6794), CONST(0x00BFA1FF),
	CONST(0x18FAB36D), CONST(0x24086BF5), CONST(0xF5FF2BF8), CONST(0x04270CA8),
	CONST(0xFF93E21B), CONST(0x0060C1E9), CONST(0x002458FC), CONST(0x0069F16C),
	CONST(0x03436717), CONST(0xFEBDD6E5), CONST(0x1C7762DF), CONST(0x221D9DE0),
	CONST(0xF950DCFC), CONST(0x0412523E), CONST(0xFFF44825), CONST(0x004AB4C5),
	CONST(0x0035FF13), CONST(0x003B1FA4), CONST(0x03C04499), CONST(0xFC4086B8),
	CONST(0x1F8E43F2), CONST(0x1F8E43F2), CONST(0xFC4086B8), CONST(0x03C04499),
	CONST(0x003B1FA4), CONST(0x0035FF13), CONST(0x004AB4C5), CONST(0xFFF44825),
	CONST(0x0412523E), CONST(0xF950DCFC), CONST(0x221D9DE0), CONST(0x1C7762DF),
	CONST(0xFEBDD6E5), CONST(0x03436717), CONST(0x0069F16C), CONST(0x002458FC),
	CONST(0x0060C1E9), CONST(0xFF93E21B), CONST(0x04270CA8), CONST(0xF5FF2BF8),
	CONST(0x24086BF5), CONST(0x18FAB36D), CONST(0x00BFA1FF), CONST(0x02AD6794),
	CONST(0x0082B6EC), CONST(0x00167EE3), CONST(0x0074E5CF), CONST(0xFF1ACF26),
	CONST(0x03EBE849), CONST(0xF2625120), CONST(0x253844DE), CONST(0x153E7D35),
	CONST(0x02447D75), CONST(0x020E372C), CONST(0x0089DE90), CONST(0x000A42E6)
};

/*
	Matrixing coefficients for the 4-subband synthesis, read by synth_4():
	8 rows of 4 values, one row per newly computed V[] entry. According to
	the generator script in the comment at the end of this file, row k /
	column i holds cos((i + 0.5) * (k + 2) * PI / 4) in fixed point.
	Products with these values are shifted down by NUM_FRAC_BITS_COS.
	CONST() narrows each 32-bit literal to the FIXED width chosen by QUALITY.
*/
static const FIXED costab_4[] =
{
	CONST(0x2D413CCD), CONST(0xD2BEC333), CONST(0xD2BEC333), CONST(0x2D413CCD),
	CONST(0x187DE2A7), CONST(0xC4DF2862), CONST(0x3B20D79E), CONST(0xE7821D59),
	CONST(0x00000000), CONST(0x00000000), CONST(0x00000000), CONST(0x00000000),
	CONST(0xE7821D59), CONST(0x3B20D79E), CONST(0xC4DF2862), CONST(0x187DE2A7),
	CONST(0xD2BEC333), CONST(0x2D413CCD), CONST(0x2D413CCD), CONST(0xD2BEC333),
	CONST(0xC4DF2862), CONST(0xE7821D59), CONST(0x187DE2A7), CONST(0x3B20D79E),
	CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000),
	CONST(0xC4DF2862), CONST(0xE7821D59), CONST(0x187DE2A7), CONST(0x3B20D79E)
};

/*
	Matrixing coefficients for the 8-subband synthesis, read by synth_8():
	16 rows of 8 values, one row per newly computed V[] entry. Produced by
	the costab generator script in the comment at the end of this file with
	its loop bounds adjusted; the 8-band formula is not spelled out there
	(presumably cos((i + 0.5) * (k + 4) * PI / 8) - TODO confirm).
	Products with these values are shifted down by NUM_FRAC_BITS_COS.
	CONST() narrows each 32-bit literal to the FIXED width chosen by QUALITY.
*/
static const FIXED costab_8[] =
{
	CONST(0x2D413CCD), CONST(0xD2BEC333), CONST(0xD2BEC333), CONST(0x2D413CCD),
	CONST(0x2D413CCD), CONST(0xD2BEC333), CONST(0xD2BEC333), CONST(0x2D413CCD),
	CONST(0x238E7673), CONST(0xC13AD060), CONST(0x0C7C5C1E), CONST(0x3536CC52),
	CONST(0xCAC933AE), CONST(0xF383A3E2), CONST(0x3EC52FA0), CONST(0xDC71898D),
	CONST(0x187DE2A7), CONST(0xC4DF2862), CONST(0x3B20D79E), CONST(0xE7821D59),
	CONST(0xE7821D59), CONST(0x3B20D79E), CONST(0xC4DF2862), CONST(0x187DE2A7),
	CONST(0x0C7C5C1E), CONST(0xDC71898D), CONST(0x3536CC52), CONST(0xC13AD060),
	CONST(0x3EC52FA0), CONST(0xCAC933AE), CONST(0x238E7673), CONST(0xF383A3E2),
	CONST(0x00000000), CONST(0x00000000), CONST(0x00000000), CONST(0x00000000),
	CONST(0x00000000), CONST(0x00000000), CONST(0x00000000), CONST(0x00000000),
	CONST(0xF383A3E2), CONST(0x238E7673), CONST(0xCAC933AE), CONST(0x3EC52FA0),
	CONST(0xC13AD060), CONST(0x3536CC52), CONST(0xDC71898D), CONST(0x0C7C5C1E),
	CONST(0xE7821D59), CONST(0x3B20D79E), CONST(0xC4DF2862), CONST(0x187DE2A7),
	CONST(0x187DE2A7), CONST(0xC4DF2862), CONST(0x3B20D79E), CONST(0xE7821D59),
	CONST(0xDC71898D), CONST(0x3EC52FA0), CONST(0xF383A3E2), CONST(0xCAC933AE),
	CONST(0x3536CC52), CONST(0x0C7C5C1E), CONST(0xC13AD060), CONST(0x238E7673),
	CONST(0xD2BEC333), CONST(0x2D413CCD), CONST(0x2D413CCD), CONST(0xD2BEC333),
	CONST(0xD2BEC333), CONST(0x2D413CCD), CONST(0x2D413CCD), CONST(0xD2BEC333),
	CONST(0xCAC933AE), CONST(0x0C7C5C1E), CONST(0x3EC52FA0), CONST(0x238E7673),
	CONST(0xDC71898D), CONST(0xC13AD060), CONST(0xF383A3E2), CONST(0x3536CC52),
	CONST(0xC4DF2862), CONST(0xE7821D59), CONST(0x187DE2A7), CONST(0x3B20D79E),
	CONST(0x3B20D79E), CONST(0x187DE2A7), CONST(0xE7821D59), CONST(0xC4DF2862),
	CONST(0xC13AD060), CONST(0xCAC933AE), CONST(0xDC71898D), CONST(0xF383A3E2),
	CONST(0x0C7C5C1E), CONST(0x238E7673), CONST(0x3536CC52), CONST(0x3EC52FA0),
	CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000),
	CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000), CONST(0xC0000000),
	CONST(0xC13AD060), CONST(0xCAC933AE), CONST(0xDC71898D), CONST(0xF383A3E2),
	CONST(0x0C7C5C1E), CONST(0x238E7673), CONST(0x3536CC52), CONST(0x3EC52FA0),
	CONST(0xC4DF2862), CONST(0xE7821D59), CONST(0x187DE2A7), CONST(0x3B20D79E),
	CONST(0x3B20D79E), CONST(0x187DE2A7), CONST(0xE7821D59), CONST(0xC4DF2862),
	CONST(0xCAC933AE), CONST(0x0C7C5C1E), CONST(0x3EC52FA0), CONST(0x238E7673),
	CONST(0xDC71898D), CONST(0xC13AD060), CONST(0xF383A3E2), CONST(0x3536CC52)
};

/*
	One synthesis step for 4 subbands.

	dst	receives the 4 output samples produced by this call
	src	the 4 input subband samples
	V	history buffer that persists between calls; elements 0..79 are
		read and written here
*/
static void synth_4(OUTSAMPLE* dst, const INSAMPLE* src, FIXED* V){

	ITER row, out;

	//age the history by one block: V[8..79] take over what was in V[0..71]
	for(row = 80; row-- > 8; ) V[row] = V[row - 8];

	//matrixing: each new V entry is one 4-wide row of costab_4 dotted with the input
	for(row = 0; row < 8; row++){

		const FIXED* c = &costab_4[4 * row];
		FIXED_S acc =	(FIXED_S)c[0] * (FIXED_S)src[0] +
				(FIXED_S)c[1] * (FIXED_S)src[1] +
				(FIXED_S)c[2] * (FIXED_S)src[2] +
				(FIXED_S)c[3] * (FIXED_S)src[3];

		V[row] = acc >> NUM_FRAC_BITS_COS;
	}

	//windowing: each output sample is 10 history taps weighted by its own 10 entries of proto_4_40
	for(out = 0; out < 4; out++){

		const FIXED* w = &proto_4_40[10 * out];
		const FIXED* h = &V[out];
		FIXED_S acc =	(FIXED_S)h[ 0] * (FIXED_S)w[0] +
				(FIXED_S)h[12] * (FIXED_S)w[1] +
				(FIXED_S)h[16] * (FIXED_S)w[2] +
				(FIXED_S)h[28] * (FIXED_S)w[3] +
				(FIXED_S)h[32] * (FIXED_S)w[4] +
				(FIXED_S)h[44] * (FIXED_S)w[5] +
				(FIXED_S)h[48] * (FIXED_S)w[6] +
				(FIXED_S)h[60] * (FIXED_S)w[7] +
				(FIXED_S)h[64] * (FIXED_S)w[8] +
				(FIXED_S)h[76] * (FIXED_S)w[9];

		//the required multiplication by -4 is folded in here: shift 2 bits less, then flip the sign
		//(the further -1 in the shift count is kept as in the original; its reason is not stated in the source)
		acc >>= (NUM_FRAC_BITS_PROTO - 1 - 2);
		acc = -acc;

		//saturate to the output range before narrowing to OUTSAMPLE
		if(acc < OUT_CLIP_MIN) acc = OUT_CLIP_MIN;
		else if(acc > OUT_CLIP_MAX) acc = OUT_CLIP_MAX;

		dst[out] = (OUTSAMPLE)acc;
	}
}

/*
	One synthesis step for 8 subbands.

	dst	receives the 8 output samples produced by this call
	src	the 8 input subband samples
	V	history buffer that persists between calls; elements 0..159 are
		read and written here
*/
static void synth_8(OUTSAMPLE* dst, const INSAMPLE* src, FIXED* V){

	ITER row, out;

	//age the history by one block: V[16..159] take over what was in V[0..143]
	for(row = 160; row-- > 16; ) V[row] = V[row - 16];

	//matrixing: each new V entry is one 8-wide row of costab_8 dotted with the input
	for(row = 0; row < 16; row++){

		const FIXED* c = &costab_8[8 * row];
		FIXED_S acc =	(FIXED_S)c[0] * (FIXED_S)src[0] +
				(FIXED_S)c[1] * (FIXED_S)src[1] +
				(FIXED_S)c[2] * (FIXED_S)src[2] +
				(FIXED_S)c[3] * (FIXED_S)src[3] +
				(FIXED_S)c[4] * (FIXED_S)src[4] +
				(FIXED_S)c[5] * (FIXED_S)src[5] +
				(FIXED_S)c[6] * (FIXED_S)src[6] +
				(FIXED_S)c[7] * (FIXED_S)src[7];

		V[row] = acc >> NUM_FRAC_BITS_COS;
	}

	//windowing: each output sample is 10 history taps weighted by its own 10 entries of proto_8_80
	for(out = 0; out < 8; out++){

		const FIXED* w = &proto_8_80[10 * out];
		const FIXED* h = &V[out];
		FIXED_S acc =	(FIXED_S)h[  0] * (FIXED_S)w[0] +
				(FIXED_S)h[ 24] * (FIXED_S)w[1] +
				(FIXED_S)h[ 32] * (FIXED_S)w[2] +
				(FIXED_S)h[ 56] * (FIXED_S)w[3] +
				(FIXED_S)h[ 64] * (FIXED_S)w[4] +
				(FIXED_S)h[ 88] * (FIXED_S)w[5] +
				(FIXED_S)h[ 96] * (FIXED_S)w[6] +
				(FIXED_S)h[120] * (FIXED_S)w[7] +
				(FIXED_S)h[128] * (FIXED_S)w[8] +
				(FIXED_S)h[152] * (FIXED_S)w[9];

		//the required multiplication by -8 is folded in here: shift 3 bits less, then flip the sign
		//(the further -1 in the shift count is kept as in the original; its reason is not stated in the source)
		acc >>= (NUM_FRAC_BITS_PROTO - 1 - 3);
		acc = -acc;

		//saturate to the output range before narrowing to OUTSAMPLE
		if(acc < OUT_CLIP_MIN) acc = OUT_CLIP_MIN;
		else if(acc > OUT_CLIP_MAX) acc = OUT_CLIP_MAX;

		dst[out] = (OUTSAMPLE)acc;
	}
}

/*
	Synthesis entry point (A2DP figure 12.3): runs one filter step with the
	routine matching the subband count.

	dst	output samples (nBands of them are written)
	src	input subband samples (nBands of them are read)
	nBands	4 selects synth_4(); any other value selects synth_8()
	V	history buffer handed through to the selected routine
*/
static void synth(OUTSAMPLE* dst, const INSAMPLE* src, uint8_t nBands, FIXED* V){

	if(nBands != 4){

		synth_8(dst, src, V);
		return;
	}

	synth_4(dst, src, V);
}


/* original tables (these were reordered to be in order they are accessed)

proto_4_40:
	0.00000000E+00,5.36548976E-04,1.49188357E-03,2.73370904E-03,
	3.83720193E-03,3.89205149E-03,1.86581691E-03,-3.06012286E-03,
	1.09137620E-02,2.04385087E-02,2.88757392E-02,3.21939290E-02,
	2.58767811E-02,6.13245186E-03,-2.88217274E-02,-7.76463494E-02,
	1.35593274E-01,1.94987841E-01,2.46636662E-01,2.81828203E-01,
	2.94315332E-01,2.81828203E-01,2.46636662E-01,1.94987841E-01,
	-1.35593274E-01,-7.76463494E-02,-2.88217274E-02,6.13245186E-03,
	2.58767811E-02,3.21939290E-02,2.88757392E-02,2.04385087E-02,
	-1.09137620E-02,-3.06012286E-03,1.86581691E-03,3.89205149E-03,
	3.83720193E-03,2.73370904E-03,1.49188357E-03,5.36548976E-04

proto_8_80:

	0.00000000E+00,1.56575398E-04,3.43256425E-04,5.54620202E-04,
	8.23919506E-04,1.13992507E-03,1.47640169E-03,1.78371725E-03,
	2.01182542E-03,2.10371989E-03,1.99454554E-03,1.61656283E-03,
	9.02154502E-04,-1.78805361E-04,-1.64973098E-03,-3.49717454E-03,
	5.65949473E-03,8.02941163E-03,1.04584443E-02,1.27472335E-02,
	1.46525263E-02,1.59045603E-02,1.62208471E-02,1.53184106E-02,
	1.29371806E-02,8.85757540E-03,2.92408442E-03,-4.91578024E-03,
	-1.46404076E-02,-2.61098752E-02,-3.90751381E-02,-5.31873032E-02,
	6.79989431E-02,8.29847578E-02,9.75753918E-02,1.11196689E-01,
	1.23264548E-01,1.33264415E-01,1.40753505E-01,1.45389847E-01,
	1.46955068E-01,1.45389847E-01,1.40753505E-01,1.33264415E-01,
	1.23264548E-01,1.11196689E-01,9.75753918E-02,8.29847578E-02,
	-6.79989431E-02,-5.31873032E-02,-3.90751381E-02,-2.61098752E-02,
	-1.46404076E-02,-4.91578024E-03,2.92408442E-03,8.85757540E-03,
	1.29371806E-02,1.53184106E-02,1.62208471E-02,1.59045603E-02,
	1.46525263E-02,1.27472335E-02,1.04584443E-02,8.02941163E-03,
	-5.65949473E-03,-3.49717454E-03,-1.64973098E-03,-1.78805361E-04,
	9.02154502E-04,1.61656283E-03,1.99454554E-03,2.10371989E-03,
	2.01182542E-03,1.78371725E-03,1.47640169E-03,1.13992507E-03,
	8.23919506E-04,5.54620202E-04,3.43256425E-04,1.56575398E-04


js code to convert to fixpoint:

	var xa = new Array(values here...);

	var num = 0;
	var perRow = 4;
	var L = parseInt(xa.length);

	for(i = 0; i < L; i++){
		x = xa[i];

		var neg = 0;


		if(x < 0){
		  neg = 1;
		  x = -x;
		}
		x *= (1 << 26);   //this 26 should be the number of fraction bits
		x = parseInt(x + 0.5);
		s = x >> 28
		x &= 0x0FFFFFFF;
		if(neg){

			x = x ^ 0x0FFFFFFF;
			x++;
			s ^= 0x0F;
			if(x & 0x10000000) s++;
			x &= 0x0FFFFFFF;
			s &= 0x0F;
		}

		x = x.toString(16);
		while(x.length < 7) x = "0" + x;
		x = s.toString(16) + x;
		x = x.toUpperCase();


		document.write("0x" + x);
		if(i != L - 1) document.write(",");
		if(++num == perRow){
			num = 0;
			document.write("<br>");
		}
		else document.write(" ");
	}



js code to produce costab (adjust loop variables as needed for both table variants)

	for(k = 0; k < 8; k++) for(i = 0 ;i < 4; i++){

		document.write(Math.cos((i + 0.5) * (k + 2) * Math.PI / 4) + ", ")
	}



js code to generate order tables (they are used for those strange offsets into the V array in synth_* when generating samples):

	var L = 200;
	var V = new Array(L);
	var U = new Array(80);
	var i, j;
	var nBands = 4;

	for(i = 0; i < L; i++) V[i] = i + 1;


	for(i = 0; i < 5; i++) for(j = 0; j < nBands; j++){

		if(nBands == 4){

			U[i * 8 + j] = V[i * 16 + j];
			U[i * 8 + 4 + j] = V[i * 16 + 12 + j];
		}
		else{

			U[i * 16 + j] = V[i * 32 + j];
			U[i * 16 + 8 + j] = V[i * 32 + 24 + j];
		}
	}


	for(j = 0; j < nBands; j++) for(i = 0; i < 10; i++) document.write((U[j + nBands * i] - 1) + ",\t");







C code for reordering the proto_* tables to access order. insert table values and modify nBands as needed

	#include <stdio.h>
	#include <stdint.h>



	int32_t tabl[] =
	{
		table data here
	};


	int main(int argc, char** argv){

		int i, j;
		int nBands = 8;

		for(j = 0; j < nBands; j++) for(i = 0; i < 10; i++) printf("0x%08X, ",tabl[j + nBands * i]);
	}

*/
That's the hard part of the decoder. The easy part is left as an exercise to the reader. Enjoy.